Source Code of org.terrier.structures.indexing.singlepass.BlockInverted2DirectIndexBuilder

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BlockInverted2DirectIndexBuilder.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald (craigm{at}dcs.gla.ac.uk)
 */
package org.terrier.structures.indexing.singlepass;


import java.io.IOException;
import java.util.Arrays;


import org.terrier.structures.BlockDirectIndex;
import org.terrier.structures.BlockDirectIndexInputStream;
import org.terrier.structures.Index;
import org.terrier.structures.InvertedIndexInputStream;
import org.terrier.structures.postings.BlockFieldIterablePosting;
import org.terrier.structures.postings.BlockIterablePosting;
import org.terrier.utility.ApplicationSetup;




/** Create a block direct index from a BlockInvertedIndex.
  * <p><b>Properties:</b> 
  * <ol>
  * <li><tt>inverted2direct.processtokens</tt> - total number of tokens to attempt each iteration. Defaults to 50000000. Memory usage would more likely
  * be linked to the number of pointers and the number of blocks, however as the document index does not contain these statistics on a document basis.
  * these are impossible to estimate. Note that the default is less than Inverted2DirectIndexBuilder.</li>
  * </ol>
  * @author Craig Macdonald
    * @since 2.0 */
public class BlockInverted2DirectIndexBuilder extends Inverted2DirectIndexBuilder {
  /**
   * constructor
   * @param i
   */
  public BlockInverted2DirectIndexBuilder(Index i)
  {
    super(i);
    directIndexClass = BlockDirectIndex.class.getName();
      directIndexInputStreamClass = BlockDirectIndexInputStream.class.getName();
      basicDirectIndexPostingIteratorClass = BlockIterablePosting.class.getName();
      fieldDirectIndexPostingIteratorClass = BlockFieldIterablePosting.class.getName();
    processTokens = Long.parseLong(ApplicationSetup.getProperty("inverted2direct.processtokens", "10000000"));
  }


    /** get an array of posting object of the specified size. These will be used to hold
      * the postings for a range of documents */
    protected Posting[] getPostings(final int count)
    {
        Posting[] rtr = new Posting[count];
        if (saveTagInformation)
        {
            for(int i=0;i<count;i++)
                rtr[i] = new BlockFieldPosting();
        }
        else
        {
            for(int i=0;i<count;i++)
                rtr[i] = new BlockPosting();
        }
        return rtr;
    }


    /** returns the SPIR implementation that should be used for reading the postings
      * written earlier */
    protected PostingInRun getPostingReader()
    {
        if (saveTagInformation)
        {
            return new BlockFieldPostingInRun(fieldCount);
        }
        return new BlockPostingInRun();
    }
  
  /** traverse the inverted file, looking for all occurrences of documents in the given range */
    protected long traverseInvertedFile(final InvertedIndexInputStream iiis, int firstDocid, int lastDocid, final Posting[] directPostings)
        throws IOException
    {
        //foreach posting list in the inverted index
            //for each (in range) posting in list
                //add termid->tf tuple to the Posting array
    long tokens = 0;
        int[][] postings;
        int termId = -1;
        //array recording which of the current set of documents has had any postings written thus far
        boolean[] prevUse = new boolean[lastDocid - firstDocid + 1];
        Arrays.fill(prevUse, false);
        final int[] fieldFs = new int[fieldCount];
        
        while((postings = iiis.getNextDocuments()) != null)
        {
            termId++;
            final int[] postings_docids = postings[0];
            final int[] postings_freqs = postings[1];
            final int[] postings_blockfreqs = postings[fieldCount + 2];
            final int[] postings_blockids = postings[fieldCount + 3];
            int startOffset = Arrays.binarySearch(postings_docids, firstDocid);
            int endOffset = Arrays.binarySearch(postings_docids, lastDocid+1);
            if (startOffset < 0)
                startOffset = -(startOffset+1);
            //no documents in range for this term
            if (startOffset == postings_docids.length)
                continue;
            if (endOffset < 0)
                endOffset = -(endOffset+1);
            if (endOffset == 0)
                continue;
      int blockIndex = 0;
            if (startOffset != 0)
                for(int i=0;i<startOffset;i++)
                    blockIndex += postings_blockfreqs[i];
            for(int offset = startOffset; offset<endOffset;offset++)
            {
                if (postings_docids[offset] >= firstDocid && postings_docids[offset] <= lastDocid)
                {
                    final int writerOffset = postings_docids[offset] - firstDocid;
          tokens += postings_freqs[offset];
          final int[] blocks = new int[postings_blockfreqs[offset]];
          //System.err.println("Term has "+ postings4.length + " blocks, of which we are at offset "+ blockIndex + ", and this posting has "+ postings3[offset] + " blocks");
          /* TODO by adding offset/length parameters to the insert/writeFirstDoc methods of BlockPosting
           * and BlockFieldPosting, this arraycopy could be prevented */
          System.arraycopy(postings_blockids, blockIndex, blocks, 0, postings_blockfreqs[offset]);
                    if (prevUse[writerOffset])
                    {
                      if (saveTagInformation)
            {
              for(int fi=0;fi< fieldCount;fi++)
                fieldFs[fi] = postings[2+fi][offset];
              ((BlockFieldPosting)directPostings[writerOffset]).insert(termId, postings_freqs[offset],  fieldFs, blocks);
            }
            else
                            ((BlockPosting)directPostings[writerOffset]).insert(termId, postings_freqs[offset], blocks);
                    }
                    else
                    {
                        prevUse[writerOffset] = true;
                        if (saveTagInformation)
            {
              for(int fi=0;fi< fieldCount;fi++)
                fieldFs[fi] = postings[2+fi][offset];
              ((BlockFieldPosting)directPostings[writerOffset]).writeFirstDoc(termId, postings_freqs[offset],  fieldFs, blocks);
            }
            else
                            ((BlockPosting)directPostings[writerOffset]).writeFirstDoc(termId, postings_freqs[offset], blocks);
                    }
                }
        blockIndex+= postings_blockfreqs[offset];
            }
        }
    return tokens;
    }


  /**
   * main
   * @param args
   * @throws Exception
   */
  public static void main (String[] args) throws Exception
  {
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index i = Index.createIndex();
    if (i== null)
    {
      System.err.println("Sorry, no index could be found in default location");
      return;
    }
    new BlockInverted2DirectIndexBuilder(i).createDirectIndex();
    i.close();
  }


}
Source Code of org.terrier.structures.indexing.singlepass.BlockInverted2DirectIndexBuilder

Related Classes of org.terrier.structures.indexing.singlepass.BlockInverted2DirectIndexBuilder